Python: a code example that scrapes 51job (前程无忧) with Selenium and stores the data in a MySQL database


I spent a few days tinkering to write this code, and it pulls down essentially all of the job-related data on 51job; you can add or remove fields as needed. The code is a bit crude, but it did not throw any real errors while I was scraping with it, and it should suit beginners who want to learn from it. Without further ado, here is the code:

from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from time import sleep
import pymysql
import re


class Crawler:
    def __init__(self):
        self.wd = webdriver.Chrome()
        self.wd.implicitly_wait(20)
        self.DBHOST = "localhost"
        self.DBUSER = "root"
        self.DBPASS = "123456"
        self.DBNAME = "51job"

    # Get the data on the current results page
    def getData(self, len_Css):
        rows = []
        for i in range(1, len_Css):
            # Job title
            job_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.jname.at'.format(i)).text
            # Company name
            company_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).text
            # City / work experience / education / number of openings
            al = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.d.at'.format(i)).text.split('|')
            # The field count varies: some postings omit education, some omit work experience
            if len(al) == 4:
                city = al[0]
                experience = al[1]
                education = al[2]
                recruits_Number = al[3]
            elif len(al) == 3:
                city = al[0]
                experience = al[1]
                education = None
                recruits_Number = al[2]
            elif len(al) == 2:
                city = al[0]
                experience = None
                education = None
                recruits_Number = al[1]
            else:
                city = None
                experience = None
                education = None
                recruits_Number = None
            # Posting date
            release_Date = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.time'.format(i)).text
            # Company benefits
            # Some postings have no benefits element; the custom NoExists method checks whether it can be located
            # if self.NoExists('div.j_joblist > div:nth-child({0}) p.tags'.format(i)):
            #     welfare = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.tags'.format(i)).get_attribute("title")
            # else:
            #     welfare = None
            # Salary: the element sometimes exists but holds an empty string, so guard against that
            if bool(self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text):
                salary = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text
            else:
                salary = None
            # Company type
            company_type = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.int.at'.format(i)).text
            # Job detail URL
            job_ex_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.el[target=_blank]'.format(i)).get_attribute("href")
            # Company URL
            company_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).get_attribute("href")
            rows.append([job_name, company_name, city, experience, education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url])
        return rows

    # Store the scraped rows in the database
    def saveData(self, rows):
        db = pymysql.connect(host=self.DBHOST, user=self.DBUSER, password=self.DBPASS, database=self.DBNAME)
        cur = db.cursor()
        sql = "INSERT INTO ods_51job_job(job_name, company_name, job_city, job_experience, job_education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url) " \
              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            for row in rows:
                cur.execute(sql, row)
            db.commit()
        except pymysql.Error as e:
            print(e)
        finally:
            cur.close()
            db.close()

    # Scrape and save one page at a time, advancing automatically until the last page
    def scrapingData(self, City, keyWord, start_Page):
        wait = WebDriverWait(self.wd, 20, 0.5)
        # Read the total number of pages
        isNextpage = self.wd.find_element(By.CSS_SELECTOR, 'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_page > div > div > div > span:nth-child(1)').text
        result = re.findall(r'\d+', isNextpage)
        condition = int(result[0])
        sleep(2)
        print('City code: %s  keyword: %s  total pages: %d' % (City, keyWord, condition))
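The pagination loop inside scrapingData and the getUrl method that the __main__ block below calls are not shown above. Here is a minimal sketch of how they could be filled in, indented to sit inside the Crawler class; the job-card count, the next-page button selector, and the 51job search-URL format are assumptions on my part rather than something taken from the original code:

    # ---- sketch: continuation of scrapingData(self, City, keyWord, start_Page) ----
        while start_Page <= condition:
            # Count the job cards on the current page; getData iterates range(1, len_Css)
            len_Css = len(self.wd.find_elements(By.CSS_SELECTOR, 'div.j_joblist > div')) + 1
            self.saveData(self.getData(len_Css))
            print('Saved page %d of %d' % (start_Page, condition))
            if start_Page < condition:
                # The next-page button selector is a guess; adjust it to the live page
                next_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.j_page button.btn-next')))
                next_btn.click()
                sleep(2)
            start_Page += 1

    # ---- sketch: driver method that the __main__ block calls ----
    def getUrl(self, cities, start_Page, keyword):
        for city in cities:
            for kw in keyword:
                # Commonly seen 51job keyword-search URL format; treat it as an assumption
                url = 'https://search.51job.com/list/{0},000000,0000,00,9,99,{1},2,{2}.html'.format(city, kw, start_Page)
                self.wd.get(url)
                self.scrapingData(city, kw, start_Page)
                start_Page = 1  # later keyword/city combinations start from page 1 again
        self.wd.quit()

Paging by clicking the next-page button keeps the browser on the results list, which is also why WebDriverWait and EC are set up above; rebuilding the URL with an incremented page number would work just as well.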
# Popular city codes:
# {"北京", "010000"}, {"上海", "020000"}, {"广州", "030200"}, {"深圳", "040000"}, {"武汉", "180200"},
# {"西安", "200200"}, {"杭州", "080200"}, {"南京", "070200"}, {"成都", "090200"}, {"重庆", "060000"},
# {"东莞", "030800"}, {"大连", "230300"}, {"沈阳", "230200"}, {"苏州", "070300"}, {"昆明", "250200"},
# {"长沙", "190200"}, {"合肥", "150200"}, {"宁波", "080300"}, {"郑州", "170200"}, {"天津", "050000"},
# {"青岛", "120300"}, {"哈尔滨", "220200"}, {"长春", "240200"}, {"福州", "110200"}, {"珠三角", "01"}


if __name__ == '__main__':
    # Put the city codes and keywords to scrape into lists; start_page is the page to start from
    cities = ['040000', '080200', '070200', '190200', '090200', '180200']
    keyword = ['大数据', 'python', '爬虫', 'Hadoop', '数据分析师', 'Hadoop']
    start_page = 1
    a = Crawler()
    a.getUrl(cities, start_page, keyword)

In the code above I commented out the company-benefits field, because nearly every page has a few postings without it; handling those errors took too long and made scraping large amounts of data painful, so I simply dropped it. Also, the CSS paths were copied straight from the browser, so many of them could still be trimmed and optimized, but I was lazy; you could also switch to XPath paths, which can be more concise. Finally, you need to create the database table yourself; when connecting, just adjust the connection parameters in the code and the column names in the SQL, which is fairly straightforward.
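For reference, here is a one-off helper that creates a table matching the INSERT statement in saveData. The column names come from that statement; the column types and lengths are my own assumptions, the connection parameters mirror the ones in __init__, and the 51job database is assumed to exist already:

import pymysql

# One-off helper: create the table that saveData() inserts into.
# Column types/lengths are assumptions; adjust them to your data.
ddl = """
CREATE TABLE IF NOT EXISTS ods_51job_job (
    id              INT AUTO_INCREMENT PRIMARY KEY,
    job_name        VARCHAR(255),
    company_name    VARCHAR(255),
    job_city        VARCHAR(50),
    job_experience  VARCHAR(50),
    job_education   VARCHAR(50),
    recruits_Number VARCHAR(50),
    release_Date    VARCHAR(20),
    salary          VARCHAR(50),
    company_type    VARCHAR(100),
    job_ex_url      VARCHAR(500),
    company_url     VARCHAR(500)
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
try:
    with db.cursor() as cur:
        cur.execute(ddl)
    db.commit()
finally:
    db.close()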

When I ran the code myself, the errors I hit were usually timeout errors after scraping for a long while. You can lengthen the wait times a bit, but it will probably still fail eventually if you scrape enough: 51job is fairly lax, but it does have anti-scraping measures once you pull a lot of data, just not as harsh as Boss直聘, which banned my IP for two days after only a few thousand records (it happened to me more than once 😤). Finally, when the script does fail you have to change the parameters by hand and resume, which is a bit tedious and could certainly be improved, but I was too lazy; hardly anyone will read this anyway, and it works well enough for me.
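If the timeouts get annoying, one low-effort improvement is to catch them and retry the current city/keyword combination a few times instead of restarting by hand. A minimal sketch of that idea, reusing the assumed search-URL format from the getUrl sketch above; the helper name scrape_with_retry, the retry count, and the back-off times are all arbitrary choices of mine:

from time import sleep
from selenium.common.exceptions import TimeoutException, WebDriverException

def scrape_with_retry(crawler, city, kw, start_page=1, max_retries=3):
    # Retry one city/keyword combination a few times before giving up on it
    for attempt in range(1, max_retries + 1):
        try:
            url = 'https://search.51job.com/list/{0},000000,0000,00,9,99,{1},2,{2}.html'.format(city, kw, start_page)
            crawler.wd.get(url)
            crawler.scrapingData(city, kw, start_page)
            return True
        except (TimeoutException, WebDriverException) as e:
            print('Attempt %d/%d failed for %s / %s: %s' % (attempt, max_retries, city, kw, e))
            sleep(30 * attempt)  # back off a little longer after each failure
    return False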


